library(tidyverse)
library(scales)

# Load the replication data set.
# The path is relative: run the script with the working directory set to the
# folder that holds the CSV (e.g. launch R / open the project there).
# The earlier `setwd("")` call was dropped -- an empty string is not a valid
# directory, so it raised "cannot change working directory" and halted the
# script before any data was read.
us <- read_csv("benchmark_replication_data_2018.csv")

# The four self-reported posting-frequency items share one response scale.
# Define the level order once (least to most frequent) so the four factors
# cannot drift apart through a typo in one of several pasted copies.
post_freq_levels <- c(
  "Never", "Less often", "Every few weeks", "1 to 2 days a week",
  "3 to 6 days a week", "About once a day", "Several times a day"
)
post_freq_items <- c(
  "tweet_post", "tweet_post_political",
  "fb_post", "fb_post_political"
)

# Fail early with a clear message if the data set lacks one of the items.
stopifnot(all(post_freq_items %in% names(us)))

# Convert each item to a factor with the shared ordering. Any response that is
# not one of the seven labels becomes NA.
for (item in post_freq_items) {
  us[[item]] <- factor(us[[item]], levels = post_freq_levels)
}

### (1) Post Twitter ###

# Observed tweet counts (numtweets) by self-reported posting frequency.
# Respondents with a missing self-report are removed so no NA category is
# drawn. The title is set and then cleared again, so the figure has no title.
# NOTE(review): geom_point() and geom_jitter() each draw every observation,
# on top of the boxplot layer -- confirm the repeated layers are intended
# before changing them, since that would alter the figure.
p <- us %>%
  filter(!is.na(tweet_post)) %>%
  ggplot(aes(x = tweet_post, y = numtweets)) +
  geom_point() +
  geom_boxplot(aes(group = tweet_post)) +
  geom_jitter(width = 0.2, alpha = 0.6) +
  labs(
    title = "Survey Benchmarking: All Tweets",
    x = "Self-Reported Tweet Frequency",
    y = "Actual Number of Tweets Posted"
  ) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.x = element_text(angle = 5)
  ) +
  labs(title = NULL)

p


# Mean observed count within each self-reported category (NAs ignored).
tapply(us$numtweets, us$tweet_post, mean, na.rm = TRUE)
tapply(us$numpoltweets, us$tweet_post_political, mean, na.rm = TRUE)

# Correlation of the observed count with the factor's integer level code,
# followed by the share of respondents in each category.
cor(us$numtweets, as.numeric(us$tweet_post), use = "complete.obs")
with(us, table(tweet_post)) %>% prop.table()
with(us, table(tweet_post_political)) %>% prop.table()

cor(us$numpoltweets, as.numeric(us$tweet_post_political), use = "complete.obs")


### (1) Post Twitter -- by Day ###

# Observed tweets per day (numtweetsday) by self-reported posting frequency.
# ylim(0, 105) sets the y scale limits; ggplot2 removes observations outside
# that range before the boxplot statistics are computed.
# NOTE(review): geom_point() and geom_jitter() each draw every observation --
# confirm the repeated layers are intended.
p <- us %>%
  filter(!is.na(tweet_post)) %>%
  ggplot(aes(x = tweet_post, y = numtweetsday)) +
  geom_point() +
  geom_boxplot(aes(group = tweet_post)) +
  geom_jitter(width = 0.2, alpha = 0.6) +
  ylim(0, 105) +
  labs(
    title = "Survey Benchmarking: All Tweets",
    x = "Self-Reported Tweet Frequency",
    y = "Actual Number of Tweets Posted per Day"
  ) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.x = element_text(angle = 5)
  ) +
  labs(title = NULL)

p


# Table 3: mean tweets per day within each self-reported category.
tapply(us$numtweetsday, us$tweet_post, mean, na.rm = TRUE)
with(us, table(tweet_post)) %>% prop.table()

# Correlations with the numeric recodings of the self-reports.
# NOTE(review): tweet_post_recoded1 / tweet_post_pol_recoded1 are not created
# by any live code visible here; presumably they ship with the CSV -- confirm.
cor(us$numtweetsday, us$tweet_post_recoded1, use = "complete.obs")
cor(us$numpoltweetsday, us$tweet_post_pol_recoded1, use = "complete.obs")

with(us, table(tweet_post_political)) %>% prop.table()
tapply(us$numpoltweetsday, us$tweet_post_political, mean, na.rm = TRUE)


### (3) Post Twitter Politics ###

# Political tweet counts (numpoltweets) by self-reported political posting
# frequency; respondents with a missing self-report are removed.
# NOTE(review): geom_point() and geom_jitter() each draw every observation --
# confirm the repeated layers are intended.
p <- us %>%
  filter(!is.na(tweet_post_political)) %>%
  ggplot(aes(x = tweet_post_political, y = numpoltweets)) +
  geom_point() +
  geom_boxplot(aes(group = tweet_post_political)) +
  geom_jitter(width = 0.2, alpha = 0.6) +
  labs(
    title = "Survey Benchmarking: Political Tweets",
    x = "Self-Reported Political Tweet Frequency",
    y = "Estimated Number of Political Tweets Posted"
  ) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.x = element_text(angle = 5)
  ) +
  labs(title = NULL)

p


### (3) Post Twitter Politics -- by Day ###

# Political tweets per day (numpoltweetsday) by self-reported political
# posting frequency. ylim(0, 105) sets the y scale limits; ggplot2 removes
# observations outside that range before the boxplot statistics are computed.
# NOTE(review): geom_point() and geom_jitter() each draw every observation --
# confirm the repeated layers are intended.
p <- us %>%
  filter(!is.na(tweet_post_political)) %>%
  ggplot(aes(x = tweet_post_political, y = numpoltweetsday)) +
  geom_point() +
  geom_boxplot(aes(group = tweet_post_political)) +
  geom_jitter(width = 0.2, alpha = 0.6) +
  ylim(0, 105) +
  labs(
    title = "Survey Benchmarking: Political Tweets",
    x = "Self-Reported Political Tweet Frequency",
    y = "Estimated Number of Political Tweets Posted per Day"
  ) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.x = element_text(angle = 5)
  ) +
  labs(title = NULL)

p


# Table 3: mean per-day counts within each self-reported category.
with(us, tapply(numtweetsday, tweet_post, mean, na.rm = TRUE))
with(us, tapply(numpoltweetsday, tweet_post_political, mean, na.rm = TRUE))




### FB ###


### (2) Post FB ###


# Observed Facebook post counts (numposts) by self-reported posting
# frequency; respondents with a missing self-report are removed.
# NOTE(review): geom_point() and geom_jitter() each draw every observation --
# confirm the repeated layers are intended.
p <- us %>%
  filter(!is.na(fb_post)) %>%
  ggplot(aes(x = fb_post, y = numposts)) +
  geom_point() +
  geom_boxplot(aes(group = fb_post)) +
  geom_jitter(width = 0.2, alpha = 0.6) +
  labs(
    title = "Survey Benchmarking: All Facebook Posts",
    x = "Self-Reported Post Frequency",
    y = "Actual Number of Posts"
  ) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.x = element_text(angle = 5)
  ) +
  labs(title = NULL)

p


# Category means, correlations with the factor's integer level code, and the
# share of respondents in each category.
tapply(us$numposts, us$fb_post, mean, na.rm = TRUE)
cor(us$numposts, as.numeric(us$fb_post), use = "complete.obs")
cor(us$numpolposts, as.numeric(us$fb_post_political), use = "complete.obs")
with(us, table(fb_post)) %>% prop.table()
with(us, table(fb_post_political)) %>% prop.table()


### (2) Post FB -- by Day ###


# Observed Facebook posts per day (numpostsday) by self-reported posting
# frequency. ylim(0, 3.75) sets the y scale limits; ggplot2 removes
# observations outside that range before the boxplot statistics are computed.
# NOTE(review): geom_point() and geom_jitter() each draw every observation --
# confirm the repeated layers are intended.
p <- us %>%
  filter(!is.na(fb_post)) %>%
  ggplot(aes(x = fb_post, y = numpostsday)) +
  geom_point() +
  geom_boxplot(aes(group = fb_post)) +
  geom_jitter(width = 0.2, alpha = 0.6) +
  ylim(0, 3.75) +
  labs(
    title = "Survey Benchmarking: All Facebook Posts",
    x = "Self-Reported Post Frequency",
    y = "Actual Number of Posts per Day"
  ) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.x = element_text(angle = 5)
  ) +
  labs(title = NULL)

p




### (4) Post FB Politics ###

# Political Facebook post counts (numpolposts) by self-reported political
# posting frequency; respondents with a missing self-report are removed.
# NOTE(review): geom_point() and geom_jitter() each draw every observation --
# confirm the repeated layers are intended.
p <- us %>%
  filter(!is.na(fb_post_political)) %>%
  ggplot(aes(x = fb_post_political, y = numpolposts)) +
  geom_point() +
  geom_boxplot(aes(group = fb_post_political)) +
  geom_jitter(width = 0.2, alpha = 0.6) +
  labs(
    title = "Survey Benchmarking: Political Facebook Posts",
    x = "Self-Reported Political Post Frequency",
    y = "Estimated Number of Political Posts"
  ) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.x = element_text(angle = 5)
  ) +
  labs(title = NULL)

p


### (4) Post FB Politics -- per Day ###

# Political Facebook posts per day (numpolpostsday) by self-reported political
# posting frequency. ylim(0, 3.75) sets the y scale limits; ggplot2 removes
# observations outside that range before the boxplot statistics are computed.
# NOTE(review): geom_point() and geom_jitter() each draw every observation --
# confirm the repeated layers are intended.
p <- us %>%
  filter(!is.na(fb_post_political)) %>%
  ggplot(aes(x = fb_post_political, y = numpolpostsday)) +
  geom_point() +
  geom_boxplot(aes(group = fb_post_political)) +
  geom_jitter(width = 0.2, alpha = 0.6) +
  ylim(0, 3.75) +
  labs(
    title = "Survey Benchmarking: Political Facebook Posts",
    x = "Self-Reported Political Post Frequency",
    y = "Estimated Number of Political Posts per Day"
  ) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.x = element_text(angle = 5)
  ) +
  labs(title = NULL)

p


### (5) Twitter friends


# Observed number of accounts followed (numfriends) by the self-reported
# twitter_friends category; respondents with a missing self-report are
# removed. Uses the default ggplot2 theme, with slightly rotated x labels and
# the title cleared.
# NOTE(review): geom_point() and geom_jitter() each draw every observation --
# confirm the repeated layers are intended.
p <- us %>%
  filter(!is.na(twitter_friends)) %>%
  ggplot(aes(x = twitter_friends, y = numfriends)) +
  geom_point() +
  geom_boxplot(aes(group = twitter_friends)) +
  geom_jitter(width = 0.2, alpha = 0.6) +
  labs(
    title = "Survey Benchmarking: Friends on Twitter",
    x = "Self-Reported Number of Friends",
    y = "Actual Number of Accounts Followed on Twitter"
  ) +
  theme(axis.text.x = element_text(angle = 5)) +
  labs(title = NULL)

p


### (6) twitter_politicians_follow


# Observed number of politicians followed on Twitter (numpolfollow) by the
# self-reported twitter_politicians_follow category; respondents with a
# missing self-report are removed. Default theme, title cleared.
# NOTE(review): geom_point() and geom_jitter() each draw every observation --
# confirm the repeated layers are intended.
p <- us %>%
  filter(!is.na(twitter_politicians_follow)) %>%
  ggplot(aes(x = twitter_politicians_follow, y = numpolfollow)) +
  geom_point() +
  geom_boxplot(aes(group = twitter_politicians_follow)) +
  geom_jitter(width = 0.2, alpha = 0.6) +
  labs(
    title = "Survey Benchmarking: Politicians Followed on Twitter",
    x = "Self-Reported Politicians Followed",
    y = "Actual Number of Politicians Followed on Twitter"
  ) +
  theme(axis.text.x = element_text(angle = 5)) +
  labs(title = NULL)

p

# Mean observed count per self-reported category, then category shares.
with(us, tapply(numpolfollow, twitter_politicians_follow, mean, na.rm = TRUE))
table(us$twitter_politicians_follow) %>% prop.table()

### (7) fb_politicians_follow

# Observed number of politicians followed on Facebook (numpol) by the
# self-reported fb_politicians_follow category; respondents with a missing
# self-report are removed. Default theme, title cleared.
# NOTE(review): geom_point() and geom_jitter() each draw every observation --
# confirm the repeated layers are intended.
p <- us %>%
  filter(!is.na(fb_politicians_follow)) %>%
  ggplot(aes(x = fb_politicians_follow, y = numpol)) +
  geom_point() +
  geom_boxplot(aes(group = fb_politicians_follow)) +
  geom_jitter(width = 0.2, alpha = 0.6) +
  labs(
    title = "Survey Benchmarking: Politicians Followed on Facebook",
    x = "Self-Reported Politicians Followed",
    y = "Actual Number of Politicians Followed on Facebook"
  ) +
  theme(axis.text.x = element_text(angle = 5)) +
  labs(title = NULL)

p

# Mean observed count per self-reported category, then category shares.
with(us, tapply(numpol, fb_politicians_follow, mean, na.rm = TRUE))
table(us$fb_politicians_follow) %>% prop.table()


###### ERRORS #######


# Twitter all

# Disabled derivation of the outcome, kept for reference: the self-report is
# mapped to posts per day under two alternative recodings, and the signed
# difference from numtweetsday is taken for whichever recoding is closer.
# NOTE(review): since this is commented out, tw_all_err presumably ships with
# the CSV -- confirm.
# us$tweet_post_recoded1 <- c(0, 1/50, 1/21, 2/7, 6/7, 1, 3)[us$tweet_post]
# us$tweet_post_recoded2 <- c(0, 1/40, 1/21, 1/7, 3/7, 1, 3)[us$tweet_post]
# us$tw_all_err <- ifelse(abs(us$tweet_post_recoded1 - us$numtweetsday) < abs(us$tweet_post_recoded2 - us$numtweetsday), us$tweet_post_recoded1 - us$numtweetsday, us$tweet_post_recoded2 - us$numtweetsday)

# OLS of the all-tweets reporting error on respondent characteristics.
# An as.factor(faminc) term was previously considered and is left out.
err.1 <- lm(
  tw_all_err ~ voted_primary + pid3 + partystr + age + educ +
    as.factor(gender) + nonwhite,
  data = us
)
summary(err.1)


# Twitter politics

# Disabled derivation of the outcome, kept for reference.
# NOTE(review): the third line subtracts numtweetsday (all tweets) rather than
# numpoltweetsday -- confirm which series was used to build the tw_pol_err
# column that the model below reads.
# us$tweet_post_pol_recoded1 <- c(0, 1/50, 1/21, 2/7, 6/7, 1, 3)[us$tweet_post_political]
# us$tweet_post_pol_recoded2 <- c(0, 1/40, 1/21, 1/7, 3/7, 1, 3)[us$tweet_post_political]
# us$tw_pol_err <- ifelse(abs(us$tweet_post_pol_recoded1 - us$numtweetsday) < abs(us$tweet_post_pol_recoded2 - us$numtweetsday), us$tweet_post_pol_recoded1 - us$numtweetsday, us$tweet_post_pol_recoded2 - us$numtweetsday)

# OLS of the political-tweets reporting error on respondent characteristics.
# An as.factor(faminc) term was previously considered and is left out.
err.2 <- lm(
  tw_pol_err ~ voted_primary + pid3 + partystr + age + educ +
    as.factor(gender) + nonwhite,
  data = us
)
summary(err.2)


# Facebook all

# OLS of fb_all_err on the same respondent characteristics as the Twitter
# models. An as.factor(faminc) term was previously considered and is left out.
# NOTE(review): no derivation of fb_all_err is visible here; presumably it is
# built like the Twitter error columns -- confirm.
err.3 <- lm(
  fb_all_err ~ voted_primary + pid3 + partystr + age + educ +
    as.factor(gender) + nonwhite,
  data = us
)
summary(err.3)



# Facebook politics

# OLS of fb_pol_err on the same respondent characteristics as the other error
# models. An as.factor(faminc) term was previously considered and is left out.
# NOTE(review): no derivation of fb_pol_err is visible here; presumably it is
# built like the Twitter error columns -- confirm.
err.4 <- lm(
  fb_pol_err ~ voted_primary + pid3 + partystr + age + educ +
    as.factor(gender) + nonwhite,
  data = us
)
summary(err.4)




library(stargazer)

# Regression table for the four reporting-error models (err.1 to err.4).
# NOTE(review): covariate.labels is a hand-ordered list of 15 labels, and the
# note names "Other" as the party reference category -- verify both against
# the coefficient order shown by summary() of the models.
# Previously considered and left disabled:
#   omit = "age|nonwhite|female|college|income",
#   omit.labels = "Covariates", omit.yes.no = c("Yes", "No"),
stargazer(
  err.1, err.2, err.3, err.4,
  style = "apsr",
  align = TRUE,
  column.sep.width = "0pt",
  title = "Predictors of Individual-level Social Media Reporting Errors",
  covariate.labels = c(
    "Voted in primary",
    "Democrat", "Republican", "Independent", "Not sure",
    "Strong partisan",
    "Age",
    "High school", "Some college", "2-year college", "College graduate",
    "Postgrad",
    "Female",
    "Nonwhite",
    "Constant"
  ),
  dep.var.labels = c(
    "Tweets/day (All)", "Tweets/day (Pol)", "FB/day (All)", "FB/day (Pol)"
  ),
  omit.stat = c("f", "ser", "rsq"),
  notes = "OLS models. Reference category for party is ``Other.''"
)


### Table 1

# Posting-frequency items: one row per response category, one column per item
# (tweet_post, tweet_post_political, fb_post, fb_post_political).
# Counts as a LaTeX table, then proportions rounded to two decimals.
rbind(
  table(us$tweet_post),
  table(us$tweet_post_political),
  table(us$fb_post),
  table(us$fb_post_political)
) %>%
  t() %>%
  xtable::xtable()

rbind(
  prop.table(table(us$tweet_post)),
  prop.table(table(us$tweet_post_political)),
  prop.table(table(us$fb_post)),
  prop.table(table(us$fb_post_political))
) %>%
  t() %>%
  round(digits = 2)

# Politicians-followed items (Twitter, Facebook): counts, then proportions.
# NOTE(review): rbind() assumes both items have the same set of categories --
# confirm.
rbind(
  table(us$twitter_politicians_follow),
  table(us$fb_politicians_follow)
) %>%
  t() %>%
  xtable::xtable()

rbind(
  prop.table(table(us$twitter_politicians_follow)),
  prop.table(table(us$fb_politicians_follow))
) %>%
  t() %>%
  xtable::xtable()



### Attrition (Appendix B)

# Cross-tabulations of consecutive indicator columns, each paired with the
# next one in the sequence twitter_yes -> text_in_twitter_box ->
# could_scrape -> have_tweets.
# NOTE(review): column meanings are inferred from their names -- confirm
# against the codebook.
table(us[["twitter_yes"]], us[["text_in_twitter_box"]])
table(us[["text_in_twitter_box"]], us[["could_scrape"]])
table(us[["could_scrape"]], us[["have_tweets"]])

# Number of respondents with / without a non-missing numposts value.
table(!is.na(us[["numposts"]]))
